We describe the analyses of the multi-region colorectal adenocarcinoma Set7 (4 biopsies) and Set6 (6 biopsies), for which we have generated new WGS sequencing data (~100x median coverage). The data - purity, single nucleotide variants, copy number calls and clustering results - are available in Supplementary Table 2 (Excel). Results used to produce plots in the paper are available in ./data/original.zip reporting final clustering assignments.

We remark that data for these tumours has been first sequenced, at lower coverage, in Cross W, et al. The evolutionary landscape of colorectal tumorigenesis. Nat Ecol Evol. 2018;2(10):1661–1672.

To implement our analyses we use the following packages:

Please refer to the webpage of each one of them for installation instructions.

Analysis of Set7 (4 biopsies)

We discuss the analysis for Set7, one of the two colorectal samples. The output RDS objects generated by this vignette are:

You need some auxiliary functions: auxiliary_functions.R

# Packages that are required specifically for the analysis
require(CNAqc)
require(mobster)
require(VIBER)
require(dplyr)

# Source a bunch of auxiliary functions
source('auxiliary_functions.R', verbose = FALSE)

Loading the data

We begin loading the data from the CSV files "Set7_mutations.csv" and "Set7_cna.csv", which are released along with the paper.

dir.create("./Set7/")

###########################v
# Load data from CSV files #
############################

# Sample names and purity (from previous analysis we know purity)
Set7_samples = paste0('Set7_', c(55, 57, 59, 62))
Set7_purity = pio:::nmfy(Set7_samples, c(.88, .88, .88, .8))

# We load SNV data for diploid regions that map to CNA segmentes with >500 mutations. 
# These have been precomputed with the functions of the CNAqc package
Set7_mutations = readr::read_csv("./data/Set7_mutations.csv", col_types = readr::cols())

print(Set7_mutations)
#> # A tibble: 52,274 x 28
#>    chr     from     to Set7_55.DP Set7_57.DP Set7_59.DP Set7_62.DP Set7_55.NV
#>    <chr>  <dbl>  <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
#>  1 chr1  8.36e7 8.36e7         73         66         84         82         35
#>  2 chr16 4.84e7 4.84e7         48         67         61         75          7
#>  3 chr16 4.88e7 4.88e7         66         82         63         73          0
#>  4 chr16 4.89e7 4.89e7        105         69         75         94          0
#>  5 chr16 4.92e7 4.92e7         50         61         54         68          0
#>  6 chr16 4.92e7 4.92e7         87         81         88         91          0
#>  7 chr16 4.92e7 4.92e7         94         91         90         91          0
#>  8 chr16 4.92e7 4.92e7         51         34         47         41          5
#>  9 chr16 4.92e7 4.92e7         88        100         88        100          0
#> 10 chr16 4.92e7 4.92e7         86         78         83        106          0
#> # … with 52,264 more rows, and 20 more variables: Set7_57.NV <dbl>,
#> #   Set7_59.NV <dbl>, Set7_62.NV <dbl>, Set7_55_N.VAF <dbl>,
#> #   Set7_57_N.VAF <dbl>, Set7_59_N.VAF <dbl>, Set7_62_N.VAF <dbl>,
#> #   Set7_55.VAF <dbl>, Set7_57.VAF <dbl>, Set7_59.VAF <dbl>, Set7_62.VAF <dbl>,
#> #   alt <chr>, cosmic <chr>, function. <chr>, gene <chr>, mutlocation <chr>,
#> #   ref <chr>, region <chr>, vartype <chr>, patient <chr>

# Load CNA data for all segments (not just diploid)
Set7_CNA = readr::read_csv("./data/Set7_cna.csv", col_types = readr::cols())

print(Set7_CNA)
#> # A tibble: 60 x 20
#>    chr     from length     to Set7_55.minor Set7_55.Major Set7_56.minor
#>    <chr>  <dbl>  <dbl>  <dbl>         <dbl>         <dbl>         <dbl>
#>  1 chr1  1.58e6   1583 2.26e8             1             1             1
#>  2 chr1  2.27e8      3 2.27e8             1             1             1
#>  3 chr1  2.27e8    151 2.48e8             1             1             1
#>  4 chr2  2.80e5   1435 2.42e8             1             1             1
#>  5 chr3  3.86e5    763 1.41e8             1             1             1
#>  6 chr3  1.41e8    204 1.81e8             1             2             1
#>  7 chr3  1.83e8    161 1.97e8             1             1             1
#>  8 chr4  7.20e5     39 6.32e6             1             1             1
#>  9 chr4  7.70e6    794 1.88e8             1             1             1
#> 10 chr5  1.71e6    288 6.51e7             1             1             1
#> # … with 50 more rows, and 13 more variables: Set7_56.Major <dbl>,
#> #   Set7_57.minor <dbl>, Set7_57.Major <dbl>, Set7_58.minor <dbl>,
#> #   Set7_58.Major <dbl>, Set7_59.minor <dbl>, Set7_59.Major <dbl>,
#> #   Set7_60.minor <dbl>, Set7_60.Major <dbl>, Set7_61.minor <dbl>,
#> #   Set7_61.Major <dbl>, Set7_62.minor <dbl>, Set7_62.Major <dbl>

We use the CNAqc package to visualise and visualise the CNA segments and the VAF distribution.

Set7_mapped_calls = map_calls(
  CNA_calls = Set7_CNA,
  mutation_calls = Set7_mutations,
  purities = Set7_purity,
  samples = Set7_samples
)
#>  [ CNAqc - CNA Quality Check ]
#> ℹ Using reference genome coordinates for: GRCh38.
#> ! Missing CCF column from CNA calls, adding CCF = 1 assuming clonal CNA calls.
#> ! Missing segments length from CNA calls, adding it to CNA calls.
#> ℹ Input n = 52274 mutations for 60 CNA segments (60 clonal, 0 subclonal)
#> ✓ Mapped n = 52274 mutations to clonal segments (100% of input)
#>  [ CNAqc - CNA Quality Check ]
#> ℹ Using reference genome coordinates for: GRCh38.
#> ! Missing CCF column from CNA calls, adding CCF = 1 assuming clonal CNA calls.
#> ! Missing segments length from CNA calls, adding it to CNA calls.
#> ℹ Input n = 52274 mutations for 60 CNA segments (60 clonal, 0 subclonal)
#> ✓ Mapped n = 52274 mutations to clonal segments (100% of input)
#>  [ CNAqc - CNA Quality Check ]
#> ℹ Using reference genome coordinates for: GRCh38.
#> ! Missing CCF column from CNA calls, adding CCF = 1 assuming clonal CNA calls.
#> ! Missing segments length from CNA calls, adding it to CNA calls.
#> ℹ Input n = 52274 mutations for 60 CNA segments (60 clonal, 0 subclonal)
#> ✓ Mapped n = 52274 mutations to clonal segments (100% of input)
#>  [ CNAqc - CNA Quality Check ]
#> ℹ Using reference genome coordinates for: GRCh38.
#> ! Missing CCF column from CNA calls, adding CCF = 1 assuming clonal CNA calls.
#> ! Missing segments length from CNA calls, adding it to CNA calls.
#> ℹ Input n = 52274 mutations for 60 CNA segments (60 clonal, 0 subclonal)
#> ✓ Mapped n = 52274 mutations to clonal segments (100% of input)

# Save RDS
saveRDS(Set7_mapped_calls, file = "./Set7/Set7_mapped_calls.rds")

plot_calls(CNAqc_objects = Set7_mapped_calls)
#> Scale for 'x' is already present. Adding another scale for 'x', which will
#> replace the existing scale.
#> Scale for 'x' is already present. Adding another scale for 'x', which will
#> replace the existing scale.
#> Scale for 'x' is already present. Adding another scale for 'x', which will
#> replace the existing scale.
#> Scale for 'x' is already present. Adding another scale for 'x', which will
#> replace the existing scale.

Step one, removing neutral mutations

Fit each sample with MOBSTER using the raw VAF data, save the RDS output linked to this vignette, and plot the fits as (2x2 matrix).

mobster_fits = fit_mobsters(Set7_mutations, Set7_samples)
#>  [ MOBSTER fit ]
#> ✔ Loaded input data, n = 22928.
#> ❯ n = 22928. Mixture with k = 1,2 Beta(s). Pareto tail: TRUE and FALSE. Output clusters with π > 0.02 and n > 10.
#> ❯ Custom fit by Moments-matching in up to 150 steps, with ε = 1e-06 and peaks initialisation.
#> ❯ Scoring (without parallel) 1 x 2 x 2 = 4 models by ICL.
#> ℹ MOBSTER fits completed in 28.4s.
#> ── [ MOBSTER ] Set7_55 n = 22928 with k = 1 Beta(s) and a tail ───────────────────────────────────────────────────────────────────
#> ● Clusters: π = 51% [Tail] and 49% [C1], with π > 0.
#> ● Tail [n = 11360, 51%] with alpha = 1.7.
#> ● Beta C1 [n = 11568, 49%] with mean = 0.5.
#> ℹ Score(s): NLL = -20588.26; ICL = -39265.18 (-41116.28), H = 1851.1 (0). Fit converged by MM in 16 steps.
#>  [ MOBSTER fit ]
#> ✔ Loaded input data, n = 21657.
#> ❯ n = 21657. Mixture with k = 1,2 Beta(s). Pareto tail: TRUE and FALSE. Output clusters with π > 0.02 and n > 10.
#> ❯ Custom fit by Moments-matching in up to 150 steps, with ε = 1e-06 and peaks initialisation.
#> ❯ Scoring (without parallel) 1 x 2 x 2 = 4 models by ICL.
#> ℹ MOBSTER fits completed in 31.1s.
#> ── [ MOBSTER ] Set7_57 n = 21657 with k = 1 Beta(s) and a tail ───────────────────────────────────────────────────────────────────
#> ● Clusters: π = 65% [Tail] and 35% [C1], with π > 0.
#> ● Tail [n = 13463, 65%] with alpha = 1.4.
#> ● Beta C1 [n = 8194, 35%] with mean = 0.49.
#> ℹ Score(s): NLL = -19421.46; ICL = -36444.05 (-38783.03), H = 2338.97 (0). Fit converged by MM in 21 steps.
#>  [ MOBSTER fit ]
#> ✔ Loaded input data, n = 25104.
#> ❯ n = 25104. Mixture with k = 1,2 Beta(s). Pareto tail: TRUE and FALSE. Output clusters with π > 0.02 and n > 10.
#> ❯ Custom fit by Moments-matching in up to 150 steps, with ε = 1e-06 and peaks initialisation.
#> ❯ Scoring (without parallel) 1 x 2 x 2 = 4 models by ICL.
#> ℹ MOBSTER fits completed in 22.9s.
#> ── [ MOBSTER ] Set7_59 n = 25104 with k = 1 Beta(s) and a tail ───────────────────────────────────────────────────────────────────
#> ● Clusters: π = 67% [Tail] and 33% [C1], with π > 0.
#> ● Tail [n = 16463, 67%] with alpha = 1.8.
#> ● Beta C1 [n = 8641, 33%] with mean = 0.49.
#> ℹ Score(s): NLL = -28392.22; ICL = -54891.5 (-56723.65), H = 1832.15 (0). Fit converged by MM in 14 steps.
#>  [ MOBSTER fit ]
#> ✔ Loaded input data, n = 22396.
#> ❯ n = 22396. Mixture with k = 1,2 Beta(s). Pareto tail: TRUE and FALSE. Output clusters with π > 0.02 and n > 10.
#> ❯ Custom fit by Moments-matching in up to 150 steps, with ε = 1e-06 and peaks initialisation.
#> ❯ Scoring (without parallel) 1 x 2 x 2 = 4 models by ICL.
#> ℹ MOBSTER fits completed in 23.2s.
#> ── [ MOBSTER ] Set7_62 n = 22396 with k = 1 Beta(s) and a tail ───────────────────────────────────────────────────────────────────
#> ● Clusters: π = 71% [Tail] and 29% [C1], with π > 0.
#> ● Tail [n = 15396, 71%] with alpha = 1.4.
#> ● Beta C1 [n = 7000, 29%] with mean = 0.5.
#> ℹ Score(s): NLL = -20370.75; ICL = -38138.67 (-40681.4), H = 2542.73 (0). Fit converged by MM in 22 steps.

# Save RDS
saveRDS(mobster_fits, file = "./Set7/Set7_mobster_fits.rds")

# Plot a 2x2 figure
ggarrange(plotlist = lapply(mobster_fits, plot), ncol = 2, nrow = 2)

We extract non-tail mutations, i.e., mutations never assigned to a tail.

# From the MOBSTER fits
non_tail_mutations = get_nontail_mutations(mutations = Set7_mutations, mobster_fits)

print(non_tail_mutations)
#> # A tibble: 11,304 x 29
#>    chr     from     to Set7_55.DP Set7_57.DP Set7_59.DP Set7_62.DP Set7_55.NV
#>    <chr>  <dbl>  <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
#>  1 chr1  8.36e7 8.36e7         73         66         84         82         35
#>  2 chr16 7.11e7 7.11e7         69         78         58         68         39
#>  3 chr1  8.38e7 8.38e7         75         71         94         87         32
#>  4 chr1  8.39e7 8.39e7         85         73         86         71         37
#>  5 chr17 4.88e7 4.88e7         73         54         57         98         39
#>  6 chr1  8.40e7 8.40e7         77         66         84         88         27
#>  7 chr17 7.16e7 7.16e7         53         51         68         71         21
#>  8 chr1  8.41e7 8.41e7         76         80         94        103         40
#>  9 chr1  6.81e6 6.81e6         76         82         94        100         27
#> 10 chr1  8.42e7 8.42e7         66         78         88         90         37
#> # … with 11,294 more rows, and 21 more variables: Set7_57.NV <dbl>,
#> #   Set7_59.NV <dbl>, Set7_62.NV <dbl>, Set7_55_N.VAF <dbl>,
#> #   Set7_57_N.VAF <dbl>, Set7_59_N.VAF <dbl>, Set7_62_N.VAF <dbl>,
#> #   Set7_55.VAF <dbl>, Set7_57.VAF <dbl>, Set7_59.VAF <dbl>, Set7_62.VAF <dbl>,
#> #   alt <chr>, cosmic <chr>, function. <chr>, gene <chr>, mutlocation <chr>,
#> #   ref <chr>, region <chr>, vartype <chr>, patient <chr>, id <chr>

We create separate inputs for the mutation read depth (DP, coverage or total number of reads) and the number of reads with the alternative allele (NV, number of variant reads).

# Cluster non-tail mutations with VIBER
DPs = non_tail_mutations %>% select(ends_with('DP'))
NVs = non_tail_mutations %>% select(ends_with('NV'))
  
colnames(DPs) = colnames(NVs) = Set7_samples 

# Coverage data
print(DPs)
#> # A tibble: 11,304 x 4
#>    Set7_55 Set7_57 Set7_59 Set7_62
#>      <dbl>   <dbl>   <dbl>   <dbl>
#>  1      73      66      84      82
#>  2      69      78      58      68
#>  3      75      71      94      87
#>  4      85      73      86      71
#>  5      73      54      57      98
#>  6      77      66      84      88
#>  7      53      51      68      71
#>  8      76      80      94     103
#>  9      76      82      94     100
#> 10      66      78      88      90
#> # … with 11,294 more rows

Cluster remaining read counts

We can now run the variational inference fitting methods implemented in VIBER, to capture mixtures of Binomial distributions from non-tail SNVs.

options(easypar.parallel = FALSE)

# VIBER fit, and save RDS
viber_fit = VIBER::variational_fit(x = NVs, y = DPs, samples = 4, epsilon_conv = 1e-10, K = 5)
#>  [ VIBER - variational fit ]

saveRDS(viber_fit, file = "./Set7/Set7_mobster_viber_fit.rds")

# Plot a 3x2 figure -- raw fit (all clusters)
ggarrange(
  plotlist = plot(
    viber_fit, 
    colors = get_cluster_colors(palettes = 'FantasticFox1', viber_fit)
    ), 
  ncol = 3, 
  nrow = 2)

As we discuss in the paper, we use a heuristic (implemented in VIBER) to filter out some clusters, and re-perform the plots.

# Apply the heuristic, and save another RDS
heuristic_fit = VIBER::choose_clusters(viber_fit, dimensions_cutoff = 1)

saveRDS(heuristic_fit, file = "./Set7/Set7_mobster_viber_fit_heuristics.rds")

# Plot a 3x2 figure -- after the heuristic
ggarrange(
  plotlist = plot(
    heuristic_fit, 
    colors = get_cluster_colors(palettes = 'FantasticFox1', heuristic_fit)
    ), 
  ncol = 3, 
  nrow = 2)

Analysis without MOBSTER

The standard analysis without MOBSTER clusters read counts for all SNVs.

# All mutations with VIBER
DPs = Set7_mutations %>% select(ends_with('DP'))
NVs = Set7_mutations %>% select(ends_with('NV'))

colnames(DPs) = colnames(NVs) = Set7_samples 

# VIBER fit, and save RDS (one run as it takes time, K = 10 is the default as we expect more clusters here)
st_viber_fit = VIBER::variational_fit(x = NVs, y = DPs, samples = 1, epsilon_conv = 1e-8, max_iter = 100)
#>  [ VIBER - variational fit ]

saveRDS(st_viber_fit, file = "./Set7/Set7_standard_fit.rds")

# Plot a 3x2 figure -- before the heuristic
ggarrange(
  plotlist = plot(
    st_viber_fit, 
    colors = get_cluster_colors(palettes = c('BottleRocket2', "Zissou1"), st_viber_fit)
    ), 
  ncol = 3, 
  nrow = 2)


# Apply the heuristic
st_heuristic_fit = VIBER::choose_clusters(st_viber_fit, dimensions_cutoff = 1)

saveRDS(st_heuristic_fit, file = "./Set7/Set7_standard_fit_heuristics.rds")

# Plot a 3x2 figure -- after the heuristic
ggarrange(
  plotlist = plot(
    st_heuristic_fit, 
    colors = get_cluster_colors(palettes = c('BottleRocket2', "Zissou1"), st_heuristic_fit)
    ), 
  ncol = 3, 
  nrow = 2)

Summary comparive fit plots

We use an auxiliary plot assembly function that places on the top and bottom diagonal two VIBER fits, and MOBSTER fits on the diagonal of the matrix.

Without versus with MOBSTER, without the heuristic.

squareplot(
  mobster_fits = mobster_fits, 
  viber_fit_bottom = st_viber_fit, 
  viber_fit_top = viber_fit, 
  samples_list = Set7_samples,
  colors_bottom = get_cluster_colors(palettes = c('BottleRocket2', "Zissou1"), st_viber_fit),
  colors_top = get_cluster_colors(palettes = 'FantasticFox1', viber_fit)
  )

Without versus with MOBSTER, with the heuristic.

squareplot(
  mobster_fits = mobster_fits,
  viber_fit_bottom = st_heuristic_fit,
  viber_fit_top = heuristic_fit,
  samples_list = Set7_samples,
  colors_bottom = get_cluster_colors(palettes = c('BottleRocket2', "Zissou1"), st_heuristic_fit),
  colors_top = get_cluster_colors(palettes = 'FantasticFox1', heuristic_fit)
)

Analysis of Set6

The output RDS objects generated by this vignette are:

dir.create("./Set6/")

# Load sample names and purity (from previous analysis, we know purity)
Set6_samples = paste0('Set6_', c(42, 44, 45:48))
Set6_purity = pio:::nmfy(Set6_samples, c(0.66, 0.72, 0.80, 0.80, 0.80, 0.80))

# Load SNV and CNA data, as for Set7
Set6_mutations = read_csv('./data/Set6_mutations.csv')
#> Parsed with column specification:
#> cols(
#>   .default = col_double(),
#>   id = col_character(),
#>   chr = col_character(),
#>   alt = col_character(),
#>   cosmic = col_logical(),
#>   function. = col_character(),
#>   gene = col_character(),
#>   mutlocation = col_character(),
#>   ref = col_character(),
#>   region = col_character(),
#>   vartype = col_character()
#> )
#> See spec(...) for full column specifications.
Set6_CNA = read_csv('./data/Set6_cna.csv')
#> Parsed with column specification:
#> cols(
#>   chr = col_character(),
#>   from = col_double(),
#>   length = col_double(),
#>   to = col_double(),
#>   Set6_42.minor = col_double(),
#>   Set6_42.Major = col_double(),
#>   Set6_44.minor = col_double(),
#>   Set6_44.Major = col_double(),
#>   Set6_45.minor = col_double(),
#>   Set6_45.Major = col_double(),
#>   Set6_46.minor = col_double(),
#>   Set6_46.Major = col_double(),
#>   Set6_47.minor = col_double(),
#>   Set6_47.Major = col_double(),
#>   Set6_48.minor = col_double(),
#>   Set6_48.Major = col_double()
#> )

Set6_mapped_calls = map_calls(
  CNA_calls = Set6_CNA,
  mutation_calls = Set6_mutations,
  purities = Set6_purity,
  samples = Set6_samples
)
#>  [ CNAqc - CNA Quality Check ]
#> ℹ Using reference genome coordinates for: GRCh38.
#> ! Missing CCF column from CNA calls, adding CCF = 1 assuming clonal CNA calls.
#> ! Missing segments length from CNA calls, adding it to CNA calls.
#> ℹ Input n = 28840 mutations for 85 CNA segments (85 clonal, 0 subclonal)
#> ✓ Mapped n = 28840 mutations to clonal segments (100% of input)
#>  [ CNAqc - CNA Quality Check ]
#> ℹ Using reference genome coordinates for: GRCh38.
#> ! Missing CCF column from CNA calls, adding CCF = 1 assuming clonal CNA calls.
#> ! Missing segments length from CNA calls, adding it to CNA calls.
#> ℹ Input n = 28840 mutations for 85 CNA segments (85 clonal, 0 subclonal)
#> ✓ Mapped n = 28840 mutations to clonal segments (100% of input)
#>  [ CNAqc - CNA Quality Check ]
#> ℹ Using reference genome coordinates for: GRCh38.
#> ! Missing CCF column from CNA calls, adding CCF = 1 assuming clonal CNA calls.
#> ! Missing segments length from CNA calls, adding it to CNA calls.
#> ℹ Input n = 28840 mutations for 85 CNA segments (85 clonal, 0 subclonal)
#> ✓ Mapped n = 28840 mutations to clonal segments (100% of input)
#>  [ CNAqc - CNA Quality Check ]
#> ℹ Using reference genome coordinates for: GRCh38.
#> ! Missing CCF column from CNA calls, adding CCF = 1 assuming clonal CNA calls.
#> ! Missing segments length from CNA calls, adding it to CNA calls.
#> ℹ Input n = 28840 mutations for 85 CNA segments (85 clonal, 0 subclonal)
#> ✓ Mapped n = 28840 mutations to clonal segments (100% of input)
#>  [ CNAqc - CNA Quality Check ]
#> ℹ Using reference genome coordinates for: GRCh38.
#> ! Missing CCF column from CNA calls, adding CCF = 1 assuming clonal CNA calls.
#> ! Missing segments length from CNA calls, adding it to CNA calls.
#> ℹ Input n = 28840 mutations for 85 CNA segments (85 clonal, 0 subclonal)
#> ✓ Mapped n = 28840 mutations to clonal segments (100% of input)
#>  [ CNAqc - CNA Quality Check ]
#> ℹ Using reference genome coordinates for: GRCh38.
#> ! Missing CCF column from CNA calls, adding CCF = 1 assuming clonal CNA calls.
#> ! Missing segments length from CNA calls, adding it to CNA calls.
#> ℹ Input n = 28840 mutations for 85 CNA segments (85 clonal, 0 subclonal)
#> ✓ Mapped n = 28840 mutations to clonal segments (100% of input)
saveRDS(Set6_mapped_calls, file = "./Set6/Set6_mapped_calls.rds")

# MOBSTER fits
mobster_fits = fit_mobsters(Set6_mutations, Set6_samples)
#>  [ MOBSTER fit ]
#> ✔ Loaded input data, n = 13507.
#> ❯ n = 13507. Mixture with k = 1,2 Beta(s). Pareto tail: TRUE and FALSE. Output clusters with π > 0.02 and n > 10.
#> ❯ Custom fit by Moments-matching in up to 150 steps, with ε = 1e-06 and peaks initialisation.
#> ❯ Scoring (without parallel) 1 x 2 x 2 = 4 models by ICL.
#> [easypar] 2020-04-22 09:51:47 - Overriding parallel execution setup [FALSE] with global option : FALSE
#> ℹ MOBSTER fits completed in 15.2s.
#> ── [ MOBSTER ] Set6_42 n = 13507 with k = 1 Beta(s) and a tail ───────────────────────────────────────────────────────────────────
#> ● Clusters: π = 56% [Tail] and 44% [C1], with π > 0.
#> ● Tail [n = 7225, 56%] with alpha = 1.6.
#> ● Beta C1 [n = 6282, 44%] with mean = 0.52.
#> ℹ Score(s): NLL = -9175.91; ICL = -16337.63 (-18294.76), H = 1957.13 (0). Fit converged by MM in 25 steps.
#>  [ MOBSTER fit ]
#> ✔ Loaded input data, n = 13985.
#> ❯ n = 13985. Mixture with k = 1,2 Beta(s). Pareto tail: TRUE and FALSE. Output clusters with π > 0.02 and n > 10.
#> ❯ Custom fit by Moments-matching in up to 150 steps, with ε = 1e-06 and peaks initialisation.
#> ❯ Scoring (without parallel) 1 x 2 x 2 = 4 models by ICL.
#> [easypar] 2020-04-22 09:52:02 - Overriding parallel execution setup [FALSE] with global option : FALSE
#> ℹ MOBSTER fits completed in 15.4s.
#> ── [ MOBSTER ] Set6_44 n = 13985 with k = 1 Beta(s) and a tail ───────────────────────────────────────────────────────────────────
#> ● Clusters: π = 60% [Tail] and 40% [C1], with π > 0.
#> ● Tail [n = 7942, 60%] with alpha = 1.5.
#> ● Beta C1 [n = 6043, 40%] with mean = 0.48.
#> ℹ Score(s): NLL = -10582.06; ICL = -18955.39 (-21106.84), H = 2151.45 (0). Fit converged by MM in 27 steps.
#>  [ MOBSTER fit ]
#> ✔ Loaded input data, n = 13424.
#> ❯ n = 13424. Mixture with k = 1,2 Beta(s). Pareto tail: TRUE and FALSE. Output clusters with π > 0.02 and n > 10.
#> ❯ Custom fit by Moments-matching in up to 150 steps, with ε = 1e-06 and peaks initialisation.
#> ❯ Scoring (without parallel) 1 x 2 x 2 = 4 models by ICL.
#> [easypar] 2020-04-22 09:52:18 - Overriding parallel execution setup [FALSE] with global option : FALSE
#> ℹ MOBSTER fits completed in 12.4s.
#> ── [ MOBSTER ] Set6_45 n = 13424 with k = 1 Beta(s) and a tail ───────────────────────────────────────────────────────────────────
#> ● Clusters: π = 54% [Tail] and 46% [C1], with π > 0.
#> ● Tail [n = 6894, 54%] with alpha = 1.7.
#> ● Beta C1 [n = 6530, 46%] with mean = 0.46.
#> ℹ Score(s): NLL = -11703.23; ICL = -21837.51 (-23349.43), H = 1511.92 (0). Fit converged by MM in 11 steps.
#>  [ MOBSTER fit ]
#> ✔ Loaded input data, n = 14366.
#> ❯ n = 14366. Mixture with k = 1,2 Beta(s). Pareto tail: TRUE and FALSE. Output clusters with π > 0.02 and n > 10.
#> ❯ Custom fit by Moments-matching in up to 150 steps, with ε = 1e-06 and peaks initialisation.
#> ❯ Scoring (without parallel) 1 x 2 x 2 = 4 models by ICL.
#> [easypar] 2020-04-22 09:52:30 - Overriding parallel execution setup [FALSE] with global option : FALSE
#> ℹ MOBSTER fits completed in 13.9s.
#> ── [ MOBSTER ] Set6_46 n = 14366 with k = 1 Beta(s) and a tail ───────────────────────────────────────────────────────────────────
#> ● Clusters: π = 61% [Tail] and 39% [C1], with π > 0.
#> ● Tail [n = 8390, 61%] with alpha = 1.5.
#> ● Beta C1 [n = 5976, 39%] with mean = 0.5.
#> ℹ Score(s): NLL = -12186.69; ICL = -22744.16 (-24315.95), H = 1571.8 (0). Fit converged by MM in 18 steps.
#>  [ MOBSTER fit ]
#> ✔ Loaded input data, n = 14591.
#> ❯ n = 14591. Mixture with k = 1,2 Beta(s). Pareto tail: TRUE and FALSE. Output clusters with π > 0.02 and n > 10.
#> ❯ Custom fit by Moments-matching in up to 150 steps, with ε = 1e-06 and peaks initialisation.
#> ❯ Scoring (without parallel) 1 x 2 x 2 = 4 models by ICL.
#> [easypar] 2020-04-22 09:52:44 - Overriding parallel execution setup [FALSE] with global option : FALSE
#> ℹ MOBSTER fits completed in 13.9s.
#> ── [ MOBSTER ] Set6_47 n = 14591 with k = 1 Beta(s) and a tail ───────────────────────────────────────────────────────────────────
#> ● Clusters: π = 59% [Tail] and 41% [C1], with π > 0.
#> ● Tail [n = 8170, 59%] with alpha = 1.6.
#> ● Beta C1 [n = 6421, 41%] with mean = 0.47.
#> ℹ Score(s): NLL = -12678.96; ICL = -23542.06 (-25300.4), H = 1758.34 (0). Fit converged by MM in 20 steps.
#>  [ MOBSTER fit ]
#> ✔ Loaded input data, n = 13859.
#> ❯ n = 13859. Mixture with k = 1,2 Beta(s). Pareto tail: TRUE and FALSE. Output clusters with π > 0.02 and n > 10.
#> ❯ Custom fit by Moments-matching in up to 150 steps, with ε = 1e-06 and peaks initialisation.
#> ❯ Scoring (without parallel) 1 x 2 x 2 = 4 models by ICL.
#> [easypar] 2020-04-22 09:52:58 - Overriding parallel execution setup [FALSE] with global option : FALSE
#> ℹ MOBSTER fits completed in 14.6s.
#> ── [ MOBSTER ] Set6_48 n = 13859 with k = 1 Beta(s) and a tail ───────────────────────────────────────────────────────────────────
#> ● Clusters: π = 56% [Tail] and 44% [C1], with π > 0.
#> ● Tail [n = 7415, 56%] with alpha = 1.5.
#> ● Beta C1 [n = 6444, 44%] with mean = 0.47.
#> ℹ Score(s): NLL = -10868.76; ICL = -19757.45 (-21680.3), H = 1922.85 (0). Fit converged by MM in 19 steps.
saveRDS(mobster_fits, file = "./Set6/Set6_mobster_fits.rds")

# Non tails
non_tail_mutations = get_nontail_mutations(mutations = Set6_mutations, mobster_fits)

# VIBER, plus heuristic
DPs = non_tail_mutations %>% select(ends_with('DP'))
NVs = non_tail_mutations %>% select(ends_with('NV'))
colnames(DPs) = colnames(NVs) = Set6_samples 

viber_fit = VIBER::variational_fit(x = NVs, y = DPs, samples = 4, epsilon_conv = 1e-10, K = 5)
#>  [ VIBER - variational fit ]
#> ℹ Input n = 6237, with k < 5. Dirichlet concentration α = 1e-06.
#> ℹ Beta (a_0, b_0) = (1, 1); q_i = prior. Optimise: ε = 1e-10 or 5000 steps, r = 4 starts.
#> [easypar] 2020-04-22 09:53:16 - Overriding parallel execution setup [TRUE] with global option : FALSE
#> ✓ VIBER fit completed in 0.13 mins (status: converged)
#> ── [ VIBER ] My VIBER model n = 6237 (w = 6 dimensions). Fit with k = 5 clusters. ────────────────────────────────────────────────
#> ● Clusters: π = 84% [C5], 13% [C2] and 2% [C1], with π > 0.
#> ● Binomials: θ = <0.35, 0.35, 0.38, 0.4, 0.38, 0.38> [C5], <0, 0, 0.11, 0.01, 0.06, 0.16> [C2] and <0.25, 0.11, 0.14, 0, 0, 0>
#>   [C1].
#> ℹ Score(s): ELBO = -3511186.378. Fit converged in 16 steps, ε = 1e-10.
saveRDS(viber_fit, file = "./Set6/Set6_mobster_viber_fit.rds")

heuristic_fit = VIBER::choose_clusters(viber_fit, dimensions_cutoff = 1)
#> ✓ Reduced to k = 3 (from 5) selecting VIBER cluster(s) with π > 0.02, and Binomial p > 0.05 in w > 1 dimension(s).
saveRDS(heuristic_fit, file = "./Set6/Set6_mobster_viber_fit_heuristics.rds")

# Analysis without MOBSTER, plus heuristic
DPs = Set6_mutations %>% select(ends_with('DP'))
NVs = Set6_mutations %>% select(ends_with('NV'))
colnames(DPs) = colnames(NVs) = Set6_samples 

# VIBER fit
st_viber_fit = VIBER::variational_fit(x = NVs, y = DPs, samples = 1, epsilon_conv = 1e-8, max_iter = 100)
#>  [ VIBER - variational fit ]
#> ℹ Input n = 28840, with k < 10. Dirichlet concentration α = 1e-06.
#> ℹ Beta (a_0, b_0) = (1, 1); q_i = prior. Optimise: ε = 1e-08 or 100 steps, r = 1 starts.
#> [easypar] 2020-04-22 09:53:24 - Overriding parallel execution setup [TRUE] with global option : FALSE
#> ✓ VIBER fit completed in 0.5 mins (status: converged)
#> ── [ VIBER ] My VIBER model n = 28840 (w = 6 dimensions). Fit with k = 10 clusters. ──────────────────────────────────────────────
#> ● Clusters: π = 68% [C10], 21% [C6], 7% [C4], 2% [C7] and 1% [C5], with π > 0.
#> ● Binomials: θ = <0.02, 0.02, 0.02, 0.03, 0.02, 0.03> [C10], <0.34, 0.35, 0.37, 0.4, 0.37, 0.37> [C6], <0.12, 0.13, 0.12, 0.12,
#>   0.13, 0.13> [C4], <0.11, 0.09, 0.35, 0, 0, 0> [C7] and <0, 0, 0, 0.12, 0.36, 0> [C5].
#> ℹ Score(s): ELBO = -6351306.675. Fit converged in 44 steps, ε = 1e-08.
saveRDS(st_viber_fit, file = "./Set6/Set6_standard_fit.rds")

st_heuristic_fit = VIBER::choose_clusters(st_viber_fit, dimensions_cutoff = 1)
#> ✓ Reduced to k = 3 (from 10) selecting VIBER cluster(s) with π > 0.02, and Binomial p > 0.05 in w > 1 dimension(s).
#> ✓ 20284 points are now not assigned (cluster = NA) because their cluster has been removed.
saveRDS(st_heuristic_fit, file = "./Set6/Set6_standard_fit_heuristics.rds")

Without versus with MOBSTER, without the heuristic.

squareplot(
  mobster_fits = mobster_fits, 
  viber_fit_bottom = st_viber_fit, 
  viber_fit_top = viber_fit, 
  samples_list = Set6_samples,
  colors_bottom = get_cluster_colors(palettes = c('BottleRocket2', "Zissou1"), st_viber_fit),
  colors_top = get_cluster_colors(palettes = 'FantasticFox1', viber_fit)
  )

Without versus with MOBSTER, with the heuristic.

squareplot(
  mobster_fits = mobster_fits, 
  viber_fit_bottom = st_heuristic_fit, 
  viber_fit_top = heuristic_fit, 
  samples_list = Set6_samples,
  colors_bottom = get_cluster_colors(palettes = c('BottleRocket2', "Zissou1"), st_heuristic_fit),
  colors_top = get_cluster_colors(palettes = 'FantasticFox1', heuristic_fit)
  )